Lab 13 - Creating a detailed report with explanations from the NEON MAG data

Author

Madlyn

Open Libraries

library(tidyverse)
library(lubridate)
library(DT)
library(viridis)
library(janitor)
library(plotly)
library(respirometry)
library(ggplot2)
library(corrplot)
library(dplyr)
library(ggfortify)
library(vegan)

Upload Data

write.csv(NEON_soilMAGs_soilChem, file = "NEON_soilMAGs_soilChem.csv")
data <-read.csv("NEON_soilMAGs_soilChem.csv")

Environmental Drivers of Soil Microbial Communities Across US Sites

Distribution of Soil Chemistry Variables

soil_microbe <- read.csv("NEON_soilMAGs_soilChem.csv")

# Histogram of soil moisture
ggplot(soil_microbe, aes(x = soilMoisture)) +
  geom_histogram(binwidth = 2, fill = "skyblue", color = "black") +
  labs(title = "Distribution of Soil Moisture",
       x = "Soil Moisture (%)",
       y = "Count") +
  theme_minimal()

# Histogram of soil temperature
ggplot(soil_microbe, aes(x = soilTemp)) +
  geom_histogram(binwidth = 1, fill = "orange", color = "black") +
  labs(title = "Distribution of Soil Temperature",
       x = "Soil Temperature (°C)",
       y = "Count") +
  theme_minimal()

# Histogram of soil pH (water)
ggplot(soil_microbe, aes(x = soilInWaterpH)) +
  geom_histogram(binwidth = 0.2, fill = "forestgreen", color = "black") +
  labs(title = "Distribution of Soil pH (Water)",
       x = "pH (Water)",
       y = "Count") +
  theme_minimal()

# Histogram of soil pH (CaCl2)
ggplot(soil_microbe, aes(x = soilInCaClpH)) +
  geom_histogram(binwidth = 0.2, fill = "purple", color = "black") +
  labs(title = "Distribution of Soil pH (CaCl2)",
       x = "pH (CaCl2)",
       y = "Count") +
  theme_minimal()

Histograms

This data shows the overall range and frequency of soil chemistry values including moisture, temperature and pH. Here we can see that temperature and pH vary greatly and have a wider range, whereas soil moisture has less broad of the range lying primarily under 5%.

Boxplots by Site

# Boxplot of soil moisture by site
ggplot(soil_microbe, aes(x = site_ID, y = soilMoisture)) +
  geom_boxplot(fill = "skyblue") +
  labs(title = "Soil Moisture by Site",
       x = "Site",
       y = "Soil Moisture (%)") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Boxplot of soil temperature by site
ggplot(soil_microbe, aes(x = site_ID, y = soilTemp)) +
  geom_boxplot(fill = "orange") +
  labs(title = "Soil Temperature by Site",
       x = "Site",
       y = "Soil Temperature (°C)") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Boxplot of soil pH (water) by site
ggplot(soil_microbe, aes(x = site_ID, y = soilInWaterpH)) +
  geom_boxplot(fill = "forestgreen") +
  labs(title = "Soil pH (Water) by Site",
       x = "Site",
       y = "pH (Water)") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Boxplots

These boxplots compare soil chemistry across different sites specifically. This highlights the variability that exists in some sites, and the commonality in others.

Correlation Heatmap

soil_vars <- soil_microbe %>%
  select(soilMoisture, soilTemp, soilInWaterpH, soilInCaClpH,
         average_coverage, gene_count, bin_completeness, bin_contamination)

# Option A: Drop rows with missing values
soil_clean <- na.omit(soil_vars)

# Option B (safer): Keep rows and calculate correlations pairwise
soil_clean <- soil_vars

cor_matrix <- cor(soil_clean, use = "pairwise.complete.obs")

library(corrplot)
corrplot(cor_matrix, method = "color", type = "upper",
         tl.col = "black", tl.srt = 45)

Heatmap

This shows the positive correlations in red and negative correlations in blue. Here we can see that there are some positive correlations and negative correlations, but most information is not definitive.

Scatterplots of Key Relationships

# Soil moisture vs microbial abundance (average coverage)
ggplot(soil_microbe, aes(x = soilTemp, y = gene_count)) +
  geom_point(color = "darkred", size = 3, alpha = 0.7) +
  geom_smooth(method = "lm", color = "black", se = FALSE) +
  labs(title = "Soil Temperature vs Gene Count",
       x = "Soil Temperature (°C)",
       y = "Gene Count") +
  theme_minimal()

# Soil pH (water) vs gene count
ggplot(soil_microbe, aes(x = soilInWaterpH, y = gene_count)) +
  geom_point(color = "forestgreen", size = 3, alpha = 0.7) +
  geom_smooth(method = "lm", color = "black", se = FALSE) +
  labs(title = "Soil pH (Water) vs Gene Count",
       x = "Soil pH (Water)",
       y = "Gene Count") +
  theme_minimal()

# Soil Moisture vs Bin Contamination
ggplot(soil_microbe, aes(x = soilMoisture, y = bin_contamination)) +
  geom_point(color = "brown", size = 3, alpha = 0.7) +
  geom_smooth(method = "lm", color = "black", se = FALSE) +
  labs(title = "Soil Moisture vs Genome Contamination",
       x = "Soil Moisture (%)",
       y = "Bin Contamination (%)") +
  theme_minimal()

# Soil Temperature vs Bin Completeness
ggplot(soil_microbe, aes(x = soilTemp, y = bin_completeness)) +
  geom_point(color = "darkgreen", size = 3, alpha = 0.7) +
  geom_smooth(method = "lm", color = "black", se = FALSE) +
  labs(title = "Soil Temperature vs Genome Completeness",
       x = "Soil Temperature (°C)",
       y = "Bin Completeness (%)") +
  theme_minimal()

Scatterplots

These scatterplots compare two variables directly to visualize these relationships. Unfortunately there doesn’t seem to be any significant trends.

Principal Component Analysis (PCA)

soil_vars <- soil_microbe %>%
  select(soilMoisture, soilTemp, soilInWaterpH, soilInCaClpH)

# Drop rows with NA or Inf
soil_clean <- soil_vars %>%
  filter(if_all(everything(), ~ !is.na(.) & is.finite(.)))

soil_scaled <- scale(soil_clean)
pca_result <- prcomp(soil_scaled, center = TRUE, scale. = TRUE)

# Extract PCA scores
pca_scores <- as.data.frame(pca_result$x)

# IMPORTANT: match row indices to cleaned data
pca_scores$site_ID <- soil_microbe$site_ID[as.numeric(rownames(pca_scores))]

ggplot(pca_scores, aes(x = PC1, y = PC2, color = site_ID)) +
  geom_point(size = 3, alpha = 0.7) +
  labs(title = "PCA of Soil Chemistry Variables",
       x = "PC1",
       y = "PC2") +
  theme_minimal()

PCA

PCA reduces many correlated soil variables (moisture, temperature, pH) into just a few axes. This helps to identify the main drivers of variation. Basically it creates linear combinations of all the variables and samples so we can compare them across sites in this case. Here we can see that most soil samples clump together, with samples from the TALL site showing differing results and consistency.

Geographic Mapping

# Base US map
us_map <- map_data("state")

# Plot soil moisture across sites
ggplot() +
  geom_polygon(data = us_map, aes(x = long, y = lat, group = group),
               fill = "gray90", color = "black") +
  geom_point(data = soil_microbe,
             aes(x = decimalLongitude, y = decimalLatitude,
                 color = soilMoisture),
             size = 3, alpha = 0.7) +
  scale_color_viridis_c(option = "plasma") +
  labs(title = "Soil Moisture Across US Sampling Sites",
       x = "Longitude", y = "Latitude", color = "Moisture (%)") +
  theme_minimal()

# Plot soil temperature across sites
ggplot() +
  geom_polygon(data = us_map, aes(x = long, y = lat, group = group),
               fill = "gray90", color = "black") +
  geom_point(data = soil_microbe,
             aes(x = decimalLongitude, y = decimalLatitude,
                 color = soilTemp),
             size = 3, alpha = 0.7) +
  scale_color_viridis_c(option = "magma") +
  labs(title = "Soil Temperature Across US Sampling Sites",
       x = "Longitude", y = "Latitude", color = "Temperature (°C)") +
  theme_minimal()

# Plot soil pH (water) across sites
ggplot() +
  geom_polygon(data = us_map, aes(x = long, y = lat, group = group),
               fill = "gray90", color = "black") +
  geom_point(data = soil_microbe,
             aes(x = decimalLongitude, y = decimalLatitude,
                 color = soilInWaterpH),
             size = 3, alpha = 0.7) +
  scale_color_viridis_c(option = "plasma") +
  labs(title = "Soil pH (Water) Across US Sampling Sites",
       x = "Longitude", y = "Latitude", color = "pH (Water)") +
  theme_minimal()

Maps

These maps utilize decimalLongitude and decimalLatitude to plot variables on the US map. Here we can see oil moisture is higher in the east coast, soil temperature is higher at lower latitudes, and soil pH is very diverse.

Microbial Quality Metrics

ggplot(soil_microbe, aes(x = bin_completeness, y = bin_contamination)) +
  geom_point(color = "steelblue", size = 3, alpha = 0.7) +
  geom_hline(yintercept = 10, linetype = "dashed", color = "red") +
  geom_vline(xintercept = 90, linetype = "dashed", color = "green") +
  labs(title = "Genome Completeness vs Contamination",
       x = "Completeness (%)",
       y = "Contamination (%)") +
  theme_minimal()

Metrics Plot

This plot shows us the quality of each of the samples. Overall this set is of medium quality. The implied threshold was that genome completeness was above 50% which tells us that at least half of the genome has been reconstructed. The majority of the data lies before the 90% line which is the distinguisher of “high-quality” MAGs. Additionally the threshold of contamination is below 10% generally, which reinforces medium-high quality of the data set.

Histogram of Genome Completeness and Contamination

ggplot(soil_microbe, aes(x = bin_completeness)) +
  geom_histogram(binwidth = 5, fill = "steelblue", color = "black", alpha = 0.7) +
  geom_vline(xintercept = 90, linetype = "dashed", color = "red") +
  labs(title = "Distribution of Genome Completeness",
       x = "Completeness (%)",
       y = "Number of Genomes") +
  theme_minimal()

ggplot(soil_microbe, aes(x = bin_contamination)) +
  geom_histogram(binwidth = 2, fill = "darkorange", color = "black", alpha = 0.7) +
  geom_vline(xintercept = 10, linetype = "dashed", color = "red") +
  labs(title = "Distribution of Genome Contamination",
       x = "Contamination (%)",
       y = "Number of Genomes") +
  theme_minimal()

Histogram

Here we plot the same data in a different format. As I mentioned previously the majority of the data falls below the 10% contamination threshold (good thing), as well as below the 90% completion threshold (not bad but not good?). In this representation of the data it is easier to visualize each of the variables independently. It is also easier to quantify.

PCA on Genome Quality + Soil Chemistry

vars <- soil_microbe %>%
  select(soilMoisture, soilTemp, soilInWaterpH, soilInCaClpH,
         bin_completeness, bin_contamination, gene_count)
# Drop columns with only one unique value
vars_clean <- vars[, sapply(vars, function(x) length(unique(x)) > 1)]

# Drop rows with missing values
vars_clean <- na.omit(vars_clean)
pca_result <- prcomp(vars_clean, scale. = TRUE)

summary(pca_result)   # variance explained
Importance of components:
                          PC1    PC2    PC3    PC4    PC5     PC6     PC7
Standard deviation     1.6377 1.1597 1.0171 0.8810 0.8172 0.69159 0.12849
Proportion of Variance 0.3831 0.1921 0.1478 0.1109 0.0954 0.06833 0.00236
Cumulative Proportion  0.3831 0.5753 0.7230 0.8339 0.9293 0.99764 1.00000
# Set larger text sizes before plotting
par(cex = 1.2)        # overall text size multiplier
par(cex.axis = 2.5)   # axis numbers
par(cex.lab = 2.5)    # axis titles
par(cex.main = 2.8)   # main title

# Now plot the biplot
biplot(pca_result, cex = 1.2)   # cex controls point/text size inside the plot